In [2]:
%matplotlib inline
import matplotlib.pyplot as pl
import numpy as np
# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()
In [7]:
from IPython.parallel import Client
client = Client()
In [8]:
len(client)  # number of engines currently connected to the cluster
Out[8]:
In [9]:
%px print("Hello from the cluster engines!")
In [10]:
def where_am_i():
    # imports are local so the function can be shipped to remote engines
    import os
    import socket
    return "In process with pid {0} on host: '{1}'".format(
        os.getpid(), socket.gethostname())
In [11]:
where_am_i()
Out[11]:
In [12]:
direct_view = client.direct_view()  # view targeting all engines at once
In [13]:
# apply() schedules the call on every engine and returns an AsyncResult
where_am_i_direct_results = direct_view.apply(where_am_i)
where_am_i_direct_results
Out[13]:
In [14]:
where_am_i_direct_results.get()  # block until all engines have replied
Out[14]:
In [15]:
where_am_i_direct_results.get_dict()  # same results, keyed by engine id
Out[15]:
In [16]:
lb_view = client.load_balanced_view()  # tasks go to whichever engine is free
In [17]:
# the load-balanced view runs the call on one engine picked by the scheduler
where_am_i_lb_result = lb_view.apply(where_am_i)
where_am_i_lb_result
Out[17]:
In [18]:
where_am_i_lb_result.get()
Out[18]:
In [19]:
from pyrallel import mmap_utils, model_selection
_ = reload(mmap_utils), reload(model_selection)
In [20]:
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler
digits = load_digits()
X = MinMaxScaler().fit_transform(digits.data)
y = digits.target
In [21]:
# dump 10 CV splits of the dataset to memory-mappable files on disk
digits_cv_split_filenames = mmap_utils.persist_cv_splits(
    X, y, name='digits_10', n_cv_iter=10)
digits_cv_split_filenames
Out[21]:
In [22]:
# pre-load the memmapped CV splits in memory on each engine host
mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
In [23]:
from sklearn.svm import LinearSVC
from collections import OrderedDict
import numpy as np
linear_svc_params = OrderedDict((
    ('C', np.logspace(-2, 2, 5)),  # [0.01, 0.1, 1, 10, 100]
))
linear_svc = LinearSVC()
In [24]:
# note: "RandomizedGridSeach" (sic) is the actual class name in pyrallel
linear_svc_search = model_selection.RandomizedGridSeach(lb_view)
linear_svc_search.launch_for_splits(
    linear_svc,
    linear_svc_params,
    digits_cv_split_filenames)
Out[24]:
In [25]:
linear_svc_search.boxplot_parameters(display_train=False)
In [26]:
x = np.linspace(0, int(1e3), 100)
pl.plot(x, x ** 3 / 1e9)  # assume cubic cost and ~1e9 operations per second
pl.xlabel("Number of training samples")
pl.ylabel("Estimated Convergence Time of SMO (in seconds)")
Out[26]:
In [27]:
1e6 ** 3 / 1e9 / 60 / 60 / 24 / 365
Out[27]:
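Back of the envelope: at cubic cost and roughly 1e9 operations per second, training a kernel SVM on one million samples would take 1e6 ** 3 / 1e9 = 1e9 seconds, i.e. about 31.7 years.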
In [28]:
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
# approximate (RBF) kernel feature map followed by a linear SVM
nystroem_pipeline = Pipeline([
    ('nystroem', Nystroem()),
    ('clf', LinearSVC()),
])
In [29]:
nystroem_pipeline_params = OrderedDict((
    ('nystroem__n_components', [50, 100, 200]),
    ('nystroem__gamma', np.logspace(-2, 2, 5)),
    ('clf__C', np.logspace(-2, 2, 5)),
))
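This grid holds 3 * 5 * 5 = 75 parameter combinations per CV split, from which the randomized search samples its candidates.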
In [30]:
nystroem_search = model_selection.RandomizedGridSeach(lb_view)
In [31]:
nystroem_search.launch_for_splits(
    nystroem_pipeline, nystroem_pipeline_params, digits_cv_split_filenames)
Out[31]:
In [34]:
nystroem_search
Out[34]:
In [35]:
nystroem_search.boxplot_parameters()
In [ ]:
nystroem_search.reset()
In this example we used LinearSVC, which does not provide a partial_fit method and therefore requires holding the Nystroem expansion of the complete dataset in memory. Furthermore, the Pipeline object does not optimize memory usage.
To make this example really scalable we would need to:
- add support for partial_fit to sklearn.pipeline.Pipeline;
- call partial_fit with small minibatches in the inner model evaluation function (see the sketch after this list).
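As a rough illustration of the minibatch idea, here is a minimal sketch assuming an SGDClassifier (a partial_fit-capable linear model) standing in for LinearSVC; the batching scheme and parameter values are illustrative, not part of pyrallel:

    import numpy as np
    from sklearn.datasets import load_digits
    from sklearn.kernel_approximation import Nystroem
    from sklearn.linear_model import SGDClassifier

    digits = load_digits()
    X, y = digits.data, digits.target

    # The Nystroem map only needs a representative sample to be fitted.
    nystroem = Nystroem(gamma=0.1, n_components=100).fit(X)

    clf = SGDClassifier()  # hinge loss by default, i.e. a linear SVM
    classes = np.unique(y)

    # Stream the data in small minibatches so that only one expanded
    # batch is ever held in memory at a time.
    for X_batch, y_batch in zip(np.array_split(X, 10),
                                np.array_split(y, 10)):
        clf.partial_fit(nystroem.transform(X_batch), y_batch,
                        classes=classes)

In the real tutorial setting, a loop like this would replace the full-dataset fit inside the per-split evaluation function.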